Select high coverage cells, filter out the genes that do not have at least 10 counts in at least 10 cells.
data("fluidigm")
fluidigm_high <- fluidigm[,which(colData(fluidigm)$Coverage_Type=="High")]
filter <- apply(assay(fluidigm_high)>10, 1, sum)>=10
Number of retained genes:
print(sum(filter))
## [1] 6998
Fit a zinb model on the high coverage data (no normalization).
fluidigm_high <- fluidigm_high[filter,]
high <- assay(fluidigm_high)
set.seed(23424)
print(system.time(zinb_high <- zinbFit(high, ncores = 3, K = 2)))
## user system elapsed
## 670.571 72.622 296.439
Plot the results with cells colored according to their biological condition.
bio <- as.factor(colData(fluidigm_high)$Biological_Condition)
cl <- as.factor(colData(fluidigm_high)$Cluster2)
plot(zinb_high@W, col=cols[bio], pch=19, xlab="W1", ylab="W2")
legend("topleft", levels(bio), fill=cols)
plot(zinb_high@W, col=cols2[cl], pch=19, xlab="W1", ylab="W2")
legend("topleft", levels(cl), fill=cols2)
Compared to PCA (of the log counts)
pca <- prcomp(t(log1p(high)))
plot(pca$x, col=cols[bio], pch=19)
legend("topleft", levels(bio), fill=cols)
plot(pca$x, col=cols2[cl], pch=19)
legend("topleft", levels(cl), fill=cols2)
Check the correlations between two zinb factors, the total numbers of reads and the total numbers of detected genes.
#first factor and total number of detected genes in the cell
cor(zinb_high@W[,1], colSums(high>0),method="spearman")
## [1] -0.2168706
#second factor and the total number of detected genes in the cell
cor(zinb_high@W[,2], colSums(high>0),method="spearman")
## [1] 0.5244318
#first factor and the total number of counts in the cell
cor(zinb_high@W[,1], colSums(high),method="spearman")
## [1] -0.1518794
#second factor and the total number of counts in the cell
cor(zinb_high@W[,2], colSums(high),method="spearman")
## [1] 0.3468531
Same for PCA
#first factor and total number of detected genes in the cell
cor(pca$x[,1], colSums(high>0), method="spearman")
## [1] 0.9232517
#second factor and the total number of detected genes in the cell
cor(pca$x[,2], colSums(high>0), method="spearman")
## [1] 0.297028
#first factor and the total number of counts in the cell
cor(pca$x[,1], colSums(high), method="spearman")
## [1] 0.1270105
#second factor and the total number of counts in the cell
cor(pca$x[,2], colSums(high), method="spearman")
## [1] -0.3141171
Correlation between PCA and ZINB
cor(pca$x[,1], zinb_high@W[,1], method="spearman")
## [1] -0.2783654
cor(pca$x[,2], zinb_high@W[,1], method="spearman")
## [1] -0.01188811
cor(pca$x[,1], zinb_high@W[,2], method="spearman")
## [1] 0.7553322
cor(pca$x[,2], zinb_high@W[,2], method="spearman")
## [1] -0.4728147
sce <- newSCESet(countData=data.frame(high))
sce <- computeSumFactors(sce, sizes=c(10, 20, 30))
sf <- sizeFactors(sce)
norm <- exprs(convertTo(sce, type="monocle"))
fq <- betweenLaneNormalization(high, which="full")
plotRLE(high, outline=FALSE, col=cols[bio], main="Unnormalized counts")
plotRLE(norm, outline=FALSE, col=cols[bio], main="SCRAN normalization")
plotRLE(fq, outline=FALSE, col=cols[bio], main="FQ normalization")
I’m not convinced that this is the best normalization, but since it’s the only one specifically proposed for scRNA-seq, it’s a good place to start.
set.seed(3525)
offsets <- matrix(rep(sf, NROW(high)), ncol=NROW(high), nrow=NCOL(high))
print(system.time(zinb_norm <- zinbFit(high, ncores = 3, K = 2, O_mu = offsets)))
## user system elapsed
## 748.390 80.517 318.133
Plot the results with cells colored according to their biological condition.
plot(zinb_norm@W, col=cols[bio], pch=19, xlab="W1", ylab="W2")
legend("topleft", levels(bio), fill=cols)
plot(zinb_norm@W, col=cols2[cl], pch=19, xlab="W1", ylab="W2")
legend("topleft", levels(cl), fill=cols2)
qc <- as.matrix(colData(fluidigm_high)[,metadata(fluidigm_high)$which_qc])
qcpca <- prcomp(qc, scale=TRUE)
quality <- qcpca$x[,1]
detection_rate <- colSums(high>0)
data.frame(W1=zinb_norm@W[,1], W2=zinb_norm@W[,2]) %>% ggplot(aes(W1, W2)) + geom_point(aes(color=detection_rate)) + scale_colour_gradient(low="blue", high="yellow") + theme_classic()
Compared to PCA (of the log counts)
pca_norm <- prcomp(t(log1p(norm)))
plot(pca_norm$x, col=cols[bio], pch=19)
legend("bottomleft", levels(bio), fill=cols)
plot(pca_norm$x, col=cols2[cl], pch=19)
legend("topleft", levels(cl), fill=cols2)
Check the correlations between two zinb factors, the total numbers of reads and the total numbers of detected genes.
#first factor and total number of detected genes in the cell
cor(zinb_norm@W[,1], colSums(high>0),method="spearman")
## [1] -0.2240385
#second factor and the total number of detected genes in the cell
cor(zinb_norm@W[,2], colSums(high>0),method="spearman")
## [1] 0.4842657
#first factor and the total number of counts in the cell
cor(zinb_norm@W[,1], colSums(high),method="spearman")
## [1] -0.1604021
#second factor and the total number of counts in the cell
cor(zinb_norm@W[,2], colSums(high),method="spearman")
## [1] 0.3326923
Same for PCA
#first factor and total number of detected genes in the cell
cor(pca_norm$x[,1], colSums(high>0), method="spearman")
## [1] 0.9566434
#second factor and the total number of detected genes in the cell
cor(pca_norm$x[,2], colSums(high>0), method="spearman")
## [1] -0.1056818
#first factor and the total number of counts in the cell
cor(pca_norm$x[,1], colSums(high), method="spearman")
## [1] -0.07137238
#second factor and the total number of counts in the cell
cor(pca_norm$x[,2], colSums(high), method="spearman")
## [1] 0.4917832
Correlation between PCA and ZINB
cor(pca_norm$x[,1], zinb_norm@W[,1], method="spearman")
## [1] -0.2262675
cor(pca_norm$x[,2], zinb_norm@W[,1], method="spearman")
## [1] 0.004195804
cor(pca_norm$x[,1], zinb_norm@W[,2], method="spearman")
## [1] 0.6608392
cor(pca_norm$x[,2], zinb_norm@W[,2], method="spearman")
## [1] 0.6752622
se <- newSeqExpressionSet(high)
se <- betweenLaneNormalization(se, which="full", offset=TRUE)
offsets <- t(offst(se))
set.seed(35235)
print(system.time(zinb_fq <- zinbFit(high, ncores = 3, K = 2, O_mu = offsets)))
## user system elapsed
## 841.972 87.240 353.576
Plot the results with cells colored according to their biological condition.
plot(zinb_fq@W, col=cols[bio], pch=19, xlab="W1", ylab="W2")
legend("topleft", levels(bio), fill=cols)
plot(zinb_fq@W, col=cols2[cl], pch=19, xlab="W1", ylab="W2")
legend("topleft", levels(cl), fill=cols2)
data.frame(W1=zinb_fq@W[,1], W2=zinb_fq@W[,2]) %>% ggplot(aes(W1, W2)) + geom_point(aes(color=detection_rate)) + scale_colour_gradient(low="blue", high="yellow") + theme_classic()
Compared to PCA (of the log counts)
pca_fq <- prcomp(t(log1p(fq)))
plot(pca_fq$x, col=cols[bio], pch=19)
legend("topright", levels(bio), fill=cols)
plot(pca_fq$x, col=cols2[cl], pch=19)
legend("topright", levels(cl), fill=cols2)
Check the correlations between two zinb factors, the total numbers of reads and the total numbers of detected genes.
#first factor and total number of detected genes in the cell
cor(zinb_fq@W[,1], colSums(high>0),method="spearman")
## [1] 0.08378497
#second factor and the total number of detected genes in the cell
cor(zinb_fq@W[,2], colSums(high>0),method="spearman")
## [1] 0.3754808
#first factor and the total number of counts in the cell
cor(zinb_fq@W[,1], colSums(high),method="spearman")
## [1] -0.1039336
#second factor and the total number of counts in the cell
cor(zinb_fq@W[,2], colSums(high),method="spearman")
## [1] 0.3515297
Same for PCA
#first factor and total number of detected genes in the cell
cor(pca_fq$x[,1], colSums(high>0), method="spearman")
## [1] 0.7178322
#second factor and the total number of detected genes in the cell
cor(pca_fq$x[,2], colSums(high>0), method="spearman")
## [1] 0.6338724
#first factor and the total number of counts in the cell
cor(pca_fq$x[,1], colSums(high), method="spearman")
## [1] 0.2144231
#second factor and the total number of counts in the cell
cor(pca_fq$x[,2], colSums(high), method="spearman")
## [1] -0.2588287
Correlation between PCA and ZINB
cor(pca_fq$x[,1], zinb_fq@W[,1], method="spearman")
## [1] -0.002097902
cor(pca_fq$x[,2], zinb_fq@W[,1], method="spearman")
## [1] 0.1466783
cor(pca_fq$x[,1], zinb_fq@W[,2], method="spearman")
## [1] 0.7437937
cor(pca_fq$x[,2], zinb_fq@W[,2], method="spearman")
## [1] -0.1988199
Select low coverage cells, filter out the genes that do not have at least 10 counts in at least 10 cells.
fluidigm_low <- fluidigm[,which(colData(fluidigm)$Coverage_Type=="Low")]
filter <- apply(assay(fluidigm_low)>10, 1, sum)>=10
Number of retained genes:
print(sum(filter))
## [1] 6998
Fit a zinb model on the low coverage data (no normalization).
fluidigm_low <- fluidigm_low[filter,]
low <- assay(fluidigm_low)
set.seed(654)
print(system.time(zinb_low <- zinbFit(low, ncores = 3, K = 2)))
## user system elapsed
## 379.503 60.009 163.815
Plot the results with cells colored according to their biological condition.
bio <- as.factor(colData(fluidigm_low)$Biological_Condition)
cl <- as.factor(colData(fluidigm_low)$Cluster2)
plot(zinb_low@W, col=cols[bio], pch=19, xlab="W1", ylab="W2")
legend("topleft", levels(bio), fill=cols)
plot(zinb_low@W, col=cols2[cl], pch=19, xlab="W1", ylab="W2")
legend("topleft", levels(cl), fill=cols2)
Compared to PCA (of the log counts)
pca <- prcomp(t(log1p(low)))
plot(pca$x, col=cols[bio], pch=19)
legend("topleft", levels(bio), fill=cols)
plot(pca$x, col=cols2[cl], pch=19)
legend("topleft", levels(cl), fill=cols2)
Check the correlations between two zinb factors, the total numbers of reads and the total numbers of detected genes.
#first factor and total number of detected genes in the cell
cor(zinb_low@W[,1], colSums(low>0),method="spearman")
## [1] 0.4277894
#second factor and the total number of detected genes in the cell
cor(zinb_low@W[,2], colSums(low>0),method="spearman")
## [1] -0.5360534
#first factor and the total number of counts in the cell
cor(zinb_low@W[,1], colSums(low),method="spearman")
## [1] 0.0005681818
#second factor and the total number of counts in the cell
cor(zinb_low@W[,2], colSums(low),method="spearman")
## [1] 0.2904283
Same for PCA
#first factor and total number of detected genes in the cell
cor(pca$x[,1], colSums(low>0), method="spearman")
## [1] -0.6268781
#second factor and the total number of detected genes in the cell
cor(pca$x[,2], colSums(low>0), method="spearman")
## [1] -0.03942437
#first factor and the total number of counts in the cell
cor(pca$x[,1], colSums(low), method="spearman")
## [1] 0.06979895
#second factor and the total number of counts in the cell
cor(pca$x[,2], colSums(low), method="spearman")
## [1] -0.474257
Correlation between PCA and ZINB
cor(pca$x[,1], zinb_low@W[,1], method="spearman")
## [1] -0.8020979
cor(pca$x[,2], zinb_low@W[,1], method="spearman")
## [1] -0.06770105
cor(pca$x[,1], zinb_low@W[,2], method="spearman")
## [1] 0.8125874
cor(pca$x[,2], zinb_low@W[,2], method="spearman")
## [1] -0.4075612
sce <- newSCESet(countData=data.frame(low))
sce <- computeSumFactors(sce, sizes=c(10, 20, 30))
sf <- sizeFactors(sce)
norm <- exprs(convertTo(sce, type="monocle"))
plotRLE(low, outline=FALSE, col=cols[bio], main="Unnormalized counts")
plotRLE(norm, outline=FALSE, col=cols[bio], main="SCRAN normalization")
set.seed(3525)
offsets <- matrix(rep(sf, NROW(low)), ncol=NROW(low), nrow=NCOL(low))
print(system.time(zinb_norm <- zinbFit(low, ncores = 3, K = 2, O_mu = offsets)))
## user system elapsed
## 266.052 45.152 117.459
Plot the results with cells colored according to their biological condition.
plot(zinb_norm@W, col=cols[bio], pch=19, xlab="W1", ylab="W2")
legend("topleft", levels(bio), fill=cols)
plot(zinb_norm@W, col=cols2[cl], pch=19, xlab="W1", ylab="W2")
legend("topleft", levels(cl), fill=cols2)
qc <- as.matrix(colData(fluidigm_low)[,metadata(fluidigm_low)$which_qc])
qcpca <- prcomp(qc, scale=TRUE)
quality <- qcpca$x[,1]
detection_rate <- colSums(low>0)
data.frame(W1=zinb_norm@W[,1], W2=zinb_norm@W[,2]) %>% ggplot(aes(W1, W2)) + geom_point(aes(color=detection_rate)) + scale_colour_gradient(low="blue", high="yellow") + theme_classic()
Compared to PCA (of the log counts)
pca_norm <- prcomp(t(log1p(norm)))
plot(pca_norm$x, col=cols[bio], pch=19)
legend("bottomleft", levels(bio), fill=cols)
plot(pca_norm$x, col=cols2[cl], pch=19)
legend("topleft", levels(cl), fill=cols2)
Check the correlations between two zinb factors, the total numbers of reads and the total numbers of detected genes.
#first factor and total number of detected genes in the cell
cor(zinb_norm@W[,1], colSums(low>0),method="spearman")
## [1] 0.02917491
#second factor and the total number of detected genes in the cell
cor(zinb_norm@W[,2], colSums(low>0),method="spearman")
## [1] -0.5416043
#first factor and the total number of counts in the cell
cor(zinb_norm@W[,1], colSums(low),method="spearman")
## [1] -0.03199301
#second factor and the total number of counts in the cell
cor(zinb_norm@W[,2], colSums(low),method="spearman")
## [1] 0.1593969
Same for PCA
#first factor and total number of detected genes in the cell
cor(pca_norm$x[,1], colSums(low>0), method="spearman")
## [1] -0.8312772
#second factor and the total number of detected genes in the cell
cor(pca_norm$x[,2], colSums(low>0), method="spearman")
## [1] 0.5694024
#first factor and the total number of counts in the cell
cor(pca_norm$x[,1], colSums(low), method="spearman")
## [1] 0.4491259
#second factor and the total number of counts in the cell
cor(pca_norm$x[,2], colSums(low), method="spearman")
## [1] -0.5332168
Correlation between PCA and ZINB
cor(pca_norm$x[,1], zinb_norm@W[,1], method="spearman")
## [1] 0.2093969
cor(pca_norm$x[,2], zinb_norm@W[,1], method="spearman")
## [1] 0.4050699
cor(pca_norm$x[,1], zinb_norm@W[,2], method="spearman")
## [1] 0.859965
cor(pca_norm$x[,2], zinb_norm@W[,2], method="spearman")
## [1] 0.1836538
x <- model.matrix(~scale(detection_rate))
set.seed(9948)
print(system.time(zinb_norm <- zinbFit(low, ncores = 3, K = 2, X = x, which_X_mu=1:2, which_X_pi=1L)))
## user system elapsed
## 381.644 47.689 170.539
plot(zinb_norm@W, col=cols[bio], pch=19, xlab="W1", ylab="W2")
legend("topleft", levels(bio), fill=cols)
plot(zinb_norm@W, col=cols2[cl], pch=19, xlab="W1", ylab="W2")
legend("topleft", levels(cl), fill=cols2)
data.frame(W1=zinb_norm@W[,1], W2=zinb_norm@W[,2]) %>% ggplot(aes(W1, W2)) + geom_point(aes(color=detection_rate)) + scale_colour_gradient(low="blue", high="yellow") + theme_classic()